Covid-19 Community Score

library(ggpubr)
## Loading required package: ggplot2
## Loading required package: magrittr
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble  2.1.3     ✔ purrr   0.3.4
## ✔ tidyr   1.1.0     ✔ dplyr   0.8.3
## ✔ readr   1.3.1     ✔ stringr 1.3.1
## ✔ tibble  2.1.3     ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract()   masks magrittr::extract()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
library(readr)
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(ggrepel)
library(ggthemes)
library(DT)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(corrr)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
WRITE_OUTPUT <- FALSE
RUN_RF <- FALSE
source('data_prep_risk_score.R') ## 

Correlation between health indicators

“Established” co-morbids for COVID-19

cr_established <- cor(fh_acs_established, use='pairwise.complete.obs')

new_names <- c('Cancer', 'Arthritis', 'Stroke', 'Chronic Asthma', 'COPD', 'Heart Disease', 'Diabetes', 'Kidney Disease', 'BP Medication', 'Smoking', 'High BP', 'Obesity', 'High Cholesterol', 'Male Over 65', 'Female Over 65')

colnames(cr_established) <- new_names
rownames(cr_established) <- new_names
heatmapColors <- function(numColors=16) {
    c1 <- rainbow(numColors,v=seq(0.5,1,length=numColors),s=seq(1,0.3,length=numColors),start=4/6,end=4.0001/6);
    c2 <- rainbow(numColors,v=seq(0.5,1,length=numColors),s=seq(1,0.3,length=numColors),start=1/6,end=1.0001/6);
    c3 <- c(c1,rev(c2)); 
    return(c3)
}
heatmap.2(cr_established, trace='none',margins = c(10, 10), col=heatmapColors(8))

quantile(abs(cr_established[upper.tri(cr_established)]))
##        0%       25%       50%       75%      100% 
## 0.1166857 0.3467861 0.6331859 0.7840348 0.9621185
fh_acs_established_2 <- fh_acs_established
colnames(fh_acs_established_2) <- new_names
cr_established_tidy <- fh_acs_established_2 %>% correlate() %>% stretch()
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
diabetes_related <- c('Heart Disease', 'Diabetes', 'Kidney Disease', 'Stroke')
risk_factors_diabetes <- c('High BP', 'Obesity', 'High Cholesterol')

                  
cr_established_tidy %>% filter(x %in% diabetes_related, y %in% diabetes_related) %>% summarize(quantile_low=quantile(r, na.rm = T, probs=.25), median=median(r, na.rm=T), quantile_high=quantile(r, na.rm=T, probs=.75), avg=mean(abs(r), na.rm=T))
cr_established_tidy %>% filter(x %in% risk_factors_diabetes, y %in% risk_factors_diabetes) %>% summarize(quantile_low=quantile(r, na.rm = T, probs=.25), median=median(r, na.rm=T), quantile_high=quantile(r, na.rm=T, probs=.75), avg=mean(abs(r), na.rm=T))
cr_established_tidy %>% filter(x %in% risk_factors_diabetes, y %in% diabetes_related) %>% summarize(quantile_low=quantile(r, na.rm = T, probs=.25), median=median(r, na.rm=T), quantile_high=quantile(r, na.rm=T, probs=.75), avg=mean(abs(r), na.rm=T))
cr_established_tidy %>% filter(x == 'Chronic Asthma', y == 'COPD')
cr_established_tidy %>% filter(x == 'Smoking', y == 'COPD')
cr_established_tidy %>% filter(x == 'Smoking', y == 'Chronic Asthma')
cr_established_tidy %>% filter(x == 'Obesity') %>% summarize(mean=mean(abs(r), na.rm=T))
cr_established_tidy %>% filter(x == 'Cancer', y %in% c('Male Over 65', 'Female Over 65'))  %>% summarize(mean=mean(abs(r), na.rm=T))
remove(fh_acs_established_2)
# number of tracts
num_tracts <- fh_acs[non_na, c("placename", "county_code", "stateabbr", "total_population")] 
num_tracts %>% group_by(placename, stateabbr) %>% summarize(number_of_tracts=n(), total_population=sum(total_population)) %>% arrange(desc(number_of_tracts))
num_tracts %>% group_by(placename, stateabbr) %>% summarize(number_of_tracts=n()) %>% ungroup() %>% summarise(median=median(number_of_tracts), low=min(number_of_tracts), hi=max(number_of_tracts), pct_25=quantile(number_of_tracts, probs=.25), pct_75=quantile(number_of_tracts, probs = .75),sd=sd(number_of_tracts)) %>% mutate(IQR = pct_75 -pct_25)
fh_acs_established_long <- fh_acs_established %>% cbind(fh_acs[non_na,"placename"])
names(fh_acs_established_long) <- c(new_names, "placename")
fh_acs_established_long <-  fh_acs_established_long %>% gather('disease', 'prevalence', -placename)

fh_acs_established_long <- fh_acs_established_long %>% mutate(disease = fct_reorder(disease, prevalence, .fun='median'))
prevalence_p <- ggplot(fh_acs_established_long, aes(disease, prevalence*100)) 
prevalence_p <- prevalence_p + geom_violin()
prevalence_p <- prevalence_p + xlab('') + ylab('Prevalence') + coord_flip() + theme_fivethirtyeight() + theme(axis.title = element_text())
prevalence_p

DT::datatable(fh_acs_established_long %>% group_by(disease) %>% summarize(median=median(prevalence), low=min(prevalence), hi=max(prevalence), pct_25=quantile(prevalence, probs=.25), pct_75=quantile(prevalence, probs = .75), sd=sd(prevalence)) %>% arrange(desc(median)))
DT::datatable(fh_acs_established_long %>% group_by(disease, placename) %>% summarise(median=median(prevalence), low=min(prevalence), hi=max(prevalence), pct_25=quantile(prevalence, probs=.25), pct_75=quantile(prevalence, probs = .75), sd=sd(prevalence)) %>% arrange(desc(sd)))
summ_prev_per_city <- fh_acs_established_long %>% group_by(disease, placename) %>% summarise(median=median(prevalence), low=min(prevalence), hi=max(prevalence), pct_25=quantile(prevalence, probs=.25), pct_75=quantile(prevalence, probs = .75),sd=sd(prevalence)) %>% mutate(IQR = pct_75 -pct_25)

## plot the IQR vs. median prevalence per city

summ_prev_top_iqrs <- summ_prev_per_city %>% group_by(disease) %>% top_n(3, IQR) %>% ungroup()
p <- ggplot(summ_prev_per_city, aes(median, IQR)) 
p <- p + geom_point() + facet_wrap(~disease, nrow=3)
p <- p + geom_text_repel(data=summ_prev_top_iqrs, aes(median, IQR, label=placename), color='red', size=3)
p <- p + geom_point(data=summ_prev_top_iqrs, aes(median, IQR), color='red')
p <- p + theme_fivethirtyeight() + theme(axis.title = element_text(), legend.position = 'none') + labs(x='Median Prevalence in a City', y='75th - 25th Percentile of Prevalence in a City')
p

PC of established COVID risk factors

pc_established <- prcomp(fh_acs_established,center=T, scale.=T)
summary(pc_established)
## Importance of components:
##                           PC1    PC2     PC3     PC4    PC5     PC6
## Standard deviation     3.0352 1.9013 0.74619 0.67495 0.5519 0.50395
## Proportion of Variance 0.6141 0.2410 0.03712 0.03037 0.0203 0.01693
## Cumulative Proportion  0.6141 0.8551 0.89227 0.92264 0.9429 0.95988
##                            PC7     PC8     PC9    PC10    PC11    PC12
## Standard deviation     0.40954 0.38578 0.32818 0.25263 0.19010 0.18311
## Proportion of Variance 0.01118 0.00992 0.00718 0.00425 0.00241 0.00224
## Cumulative Proportion  0.97106 0.98098 0.98816 0.99241 0.99482 0.99706
##                           PC13    PC14    PC15
## Standard deviation     0.13607 0.11964 0.10621
## Proportion of Variance 0.00123 0.00095 0.00075
## Cumulative Proportion  0.99829 0.99925 1.00000
pc_established$rotation [, c(1,2)]
##                           PC1         PC2
## CANCER_CrudePrev    0.1577109 -0.42669445
## ARTHRITIS_CrudePrev 0.3001464 -0.12969807
## STROKE_CrudePrev    0.3132249  0.09335877
## CASTHMA_CrudePrev   0.1995424  0.29981442
## COPD_CrudePrev      0.3030841  0.11966747
## CHD_CrudePrev       0.3162948 -0.04589079
## DIABETES_CrudePrev  0.2960469  0.12676442
## KIDNEY_CrudePrev    0.3074523  0.07334282
## BPMED_CrudePrev     0.2581304 -0.21748557
## CSMOKING_CrudePrev  0.2108770  0.35609321
## BPHIGH_CrudePrev    0.3168123  0.02448109
## OBESITY_CrudePrev   0.2365472  0.30528754
## HIGHCHOL_CrudePrev  0.2699629 -0.21002118
## male_over_65_pct    0.1161608 -0.42473566
## female_over_65_pct  0.1387121 -0.41499353

PC1 and 2 explain 90% of variation

fh_acs_established$pc_1 <- as.numeric(pc_established$x[, 1])
fh_acs_established$pc_2 <- as.numeric(pc_established$x[, 2])
placenames_cols <- c("stateabbr","placename" , "county_code", "state_code", "geoid","fips_place_tract","lat","long")
fh_acs_scoring <- fh_acs_established %>% cbind(fh_acs[non_na, placenames_cols])

fh_acs_scoring <- fh_acs_scoring %>% mutate(avg_over_65_pct = (male_over_65_pct+female_over_65_pct)/2)
fh_acs_scoring <- fh_acs_scoring %>% mutate(risk_score_pc_1=rescale(pc_1, to=c(0,100)))
fh_acs_scoring <- fh_acs_scoring %>% mutate(risk_score_pc_2=rescale(-pc_2, to=c(0,100))) # flip sign of pc2
fh_acs_scoring <- fh_acs_scoring %>% mutate(risk_score_both_pc=rescale(risk_score_pc_1*.61 + risk_score_pc_2*.24, to=c(0,100))) # rescale according the the contribution of the PC
fh_acs_scoring <- fh_acs_scoring %>% mutate(risk_score_all=rowSums(.[established_cols])) %>% mutate(risk_score_all=rescale(risk_score_all, to=c(0,100)))

## load in the errors 
comm_score_error <- read_rds('./scores/fh_acs_covid_comm_score_error.rds') # see error_simulation.R
fh_acs_scoring <- fh_acs_scoring %>% mutate(score_error=comm_score_error$score_error)
fh_acs_scoring %>% arrange(score_error) %>% dplyr::select(c(score_error, placename)) 

Examine PCs and association with an additive risk score

# show the top 5 tracts for the each axis
fh_acs_scoring_top <- fh_acs_scoring %>% top_n(10, risk_score_pc_1 ) %>%
  rbind(fh_acs_scoring %>% top_n(10, risk_score_pc_2 )) %>% unite(place, placename, stateabbr, sep=",")

p <- ggplot(fh_acs_scoring, aes(risk_score_pc_1, risk_score_pc_2)) 
p <- p + geom_point(alpha=0.5, color='gray')
p <- p + geom_point(data=fh_acs_scoring_top, aes(risk_score_pc_1, risk_score_pc_2), color='red')
p <- p + geom_text_repel(data=fh_acs_scoring_top, aes(risk_score_pc_1, risk_score_pc_2, label=place), color='red',size=3)
p <- p + theme_fivethirtyeight() + theme(axis.title = element_text(), legend.position = 'none') + labs(x='Component 1 (61%)', y='Component 2 (24%)')
p

## now get the variation per city for the risk score
summarized_risk_score_by_city <- fh_acs_scoring %>% group_by(placename) %>% summarise(stateabbr=first(stateabbr), median=median(risk_score_both_pc), low=min(risk_score_both_pc), hi=max(risk_score_both_pc), pct_25=quantile(risk_score_both_pc, probs=.25), pct_75=quantile(risk_score_both_pc, probs = .75),sd=sd(risk_score_both_pc)) %>% mutate(IQR = pct_75 -pct_25)

summarized_risk_score_by_city_top_iqr <- summarized_risk_score_by_city %>% top_n(10, IQR)
summarized_risk_score_by_city_top_iqr <- summarized_risk_score_by_city_top_iqr %>% rbind(summarized_risk_score_by_city %>% filter(median >= 40, IQR > 10),summarized_risk_score_by_city %>% filter(median >= 50)) %>% unite(place, placename, stateabbr, sep=",")

p <- ggplot(summarized_risk_score_by_city, aes(median, IQR)) 
p <- p+ geom_point(alpha=.5)
p <- p + geom_text_repel(data=summarized_risk_score_by_city_top_iqr, aes(median, IQR, label=place), color='red', size=3)
p <- p + geom_point(data=summarized_risk_score_by_city_top_iqr, aes(median, IQR), color='red')
p <- p +  theme_fivethirtyeight() + theme(axis.title = element_text(), legend.position = 'none') + labs(x = '% Median C19 Risk Score in a City', y = 'Difference in IQR of C19 Risk Score in a City') 
p

summarized_risk_score_by_city_top_iqr 

Correlation with Sociodemographic Factors

ses_vars_to_bind <- setdiff(ses_vars, names(fh_acs_scoring))
fh_all <- cbind(fh_acs_scoring, fh_acs[non_na, ses_vars_to_bind])

top_per_state <- fh_acs_scoring %>% group_by(stateabbr) %>% top_n(1,risk_score_both_pc) %>% ungroup() %>% filter(risk_score_both_pc >= 75)
# top census tracts per state based on age, but filtered for those that have a high risk score
oldest_tracts <- fh_acs_scoring %>% group_by(stateabbr) %>% top_n(1,avg_over_65_pct) %>% ungroup() %>% filter(avg_over_65_pct >= .50)
top_both <- fh_acs_scoring %>% filter(risk_score_both_pc >= 75 & avg_over_65_pct >= .50)
p <- ggplot(fh_acs_scoring, aes(I(100*avg_over_65_pct), risk_score_both_pc))
p <- p + geom_point(alpha=0.5, color='gray')
p <- p + geom_point(data=top_per_state, aes(I(100*avg_over_65_pct), risk_score_both_pc))
p <- p + geom_text_repel(data=top_per_state, aes(I(100*avg_over_65_pct), risk_score_both_pc, label=paste(placename, stateabbr)), size=3)
p <- p + geom_point(data=oldest_tracts, aes(I(100*avg_over_65_pct), risk_score_both_pc), color='red')
p <- p + geom_text_repel(data=oldest_tracts, aes(I(100*avg_over_65_pct), risk_score_both_pc, label=paste(placename, stateabbr)),color='red', size=3)
p <- p + theme_fivethirtyeight() + theme(axis.title = element_text(), legend.position = 'none') + labs(x = '% Age over 65', y = '')
p

top_score <- fh_all %>% group_by(stateabbr) %>% top_n(2, risk_score_both_pc) %>% filter(risk_score_both_pc >= 75)
p_income <- ggplot(fh_all, aes(median_income, risk_score_both_pc))
p_income <- p_income + geom_point(alpha=0.5, color='gray') 
p_income <- p_income + geom_point(data=top_score, aes(median_income, risk_score_both_pc))
p_income <- p_income + geom_text_repel(data=top_score, aes(median_income, risk_score_both_pc, label=paste(placename, stateabbr)),color='red', size=3)
p_income <- p_income + theme_fivethirtyeight() + theme(axis.title = element_text(), legend.position = 'none') + labs(x = 'Median Income per Tract', y = '')
p_income
## Warning: Removed 110 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).

p_poverty <- ggplot(fh_all, aes(I(100*at_below_poverty_pct), risk_score_both_pc))
p_poverty <- p_poverty + geom_point(alpha=0.5, color='gray')
p_poverty <- p_poverty + geom_point(data=top_score, aes(I(100*at_below_poverty_pct), risk_score_both_pc))
p_poverty <- p_poverty + geom_text_repel(data=top_score, aes(I(100*at_below_poverty_pct), risk_score_both_pc, label=paste(placename, stateabbr)),color='red', size=3)

high_poverty <- fh_all %>% group_by(stateabbr) %>% top_n(1, at_below_poverty_pct) %>% filter(at_below_poverty_pct >= .60, risk_score_both_pc > 50)
p_poverty <- p_poverty + geom_point(data=high_poverty, aes(I(100*at_below_poverty_pct), risk_score_both_pc))
p_poverty <- p_poverty + geom_text_repel(data=high_poverty, aes(I(100*at_below_poverty_pct), risk_score_both_pc, label=paste(placename, stateabbr)),color='red', size=3)
p_poverty <- p_poverty + theme_fivethirtyeight() + theme(axis.title = element_text(), legend.position = 'none') + labs(x = '% Below Poverty per Tract', y = '')
p_poverty
## Warning: Removed 44 rows containing missing values (geom_point).

set.seed(123)
fh_all_train_ind <- sample(nrow(fh_all), size=floor(nrow(fh_all)/2))
train <- fh_all[fh_all_train_ind,]
test <- fh_all[setdiff(1:nrow(fh_all), fh_all_train_ind), ]

mod_linear <- lm(risk_score_both_pc ~ scale(median_income) + scale(median_home_value) + scale(at_below_poverty_pct) + scale(unemployment_pct) + scale(non_employment_pct) + scale(less_than_high_school_pct) + scale(no_health_insurance_pct) + scale(more_than_one_occupant_per_room_pct) + scale(african_american_pct) + scale(hispanic_pct) + scale(asian_pct) + scale(other_race_pct), data=train)

test_prediction_lm <- tibble(predicted=predict(mod_linear, test), actual=test$risk_score_both_pc)
predicted_cases_index <- complete.cases(test_prediction_lm)
test_prediction_lm <- test_prediction_lm[predicted_cases_index, ]
summary(lm(predicted ~ actual, test_prediction_lm))
## 
## Call:
## lm(formula = predicted ~ actual, data = test_prediction_lm)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.092  -2.605  -0.394   2.044  41.771 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 15.591745   0.156285   99.77   <2e-16 ***
## actual       0.542774   0.004482  121.09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.228 on 13061 degrees of freedom
## Multiple R-squared:  0.5289, Adjusted R-squared:  0.5289 
## F-statistic: 1.466e+04 on 1 and 13061 DF,  p-value: < 2.2e-16
if(RUN_RF) {
  
mod_rf <- randomForest(risk_score_both_pc ~ median_income + unemployment_pct + median_home_value + at_below_poverty_pct + unemployment_pct + non_employment_pct + less_than_high_school_pct + no_health_insurance_pct + more_than_one_occupant_per_room_pct + african_american_pct + hispanic_pct + asian_pct + other_race_pct, data=train, na.action=na.roughfix, ntree=1000, importance=T)
  importance(mod_rf)
  test_prediction_rf <- tibble(predicted=predict(mod_rf, test), actual=test$risk_score_both_pc)
predicted_cases_index <- complete.cases(test_prediction_rf)
test_prediction_rf <- test_prediction_rf[predicted_cases_index, ]
summary(lm(predicted ~ actual, test_prediction_rf))

p <- ggplot(test_prediction_rf, aes(actual, predicted))
p <- p + geom_point(alpha=.1)
p

pred <- predict(mod_rf, fh_all)
fh_all <- fh_all %>% mutate(risk_score_both_pc_residual = risk_score_both_pc-pred)
  if(WRITE_OUTPUT) {
    fh_all_ses_residual <- fh_all %>% dplyr::select(fips_place_tract, geoid, placename, stateabbr, risk_score_both_pc, risk_score_both_pc_residual)

    write_rds(fh_all_ses_residual, path = "fh_acs_ses_residual_covid_comm_score.rds")
   write_csv(fh_all_ses_residual, path="fh_acs_ses_residual_covid_comm_score.csv")
  }
}

Census-tract Community Risk data dump

fh_acs_scoring <- fh_acs_scoring %>% cbind(fh_acs[non_na, c('total_males', 'total_females')])
fh_acs_scoring <- fh_acs_scoring %>% mutate(risk_score = risk_score_both_pc)
fh_acs_scoring <- fh_acs_scoring %>% group_split(placename) %>% map(function(.x) { .x %>% mutate(rank_in_city=rank(-risk_score)) }) %>% bind_rows()
fh_acs_scoring <- fh_acs_scoring %>% group_split(stateabbr) %>% map(function(.x) { .x %>% mutate(rank_in_state=rank(-risk_score)) }) %>% bind_rows()
if(WRITE_OUTPUT) {
  write_rds(fh_acs_scoring, path = "fh_acs_covid_comm_score.rds")
  write_csv(fh_acs_scoring, path="fh_acs_covid_comm_score.csv")
}

City COVID-19 Community Risk Score

fh_acs_scoring_city <- fh_acs_scoring %>% dplyr::select(-c(pc_1, pc_2, risk_score_pc_1, risk_score_pc_2, risk_score, risk_score_both_pc, risk_score_all, lat,long, geoid, fips_place_tract)) %>% unite(place_state, placename, stateabbr, sep="|")
fh_acs_scoring_city <- fh_acs_scoring_city %>% mutate(total_population = total_males + total_females)


cols_to_avg <- c(established_disease, established_risk)
cols_se <- c(established_disease_se, established_risk_se)

fh_acs_scoring_city <- fh_acs_scoring_city %>% cbind(fh_acs[non_na, c(established_disease_se, established_risk_se)])

cols_to_total <- c("male_over_65_pct","female_over_65_pct","avg_over_65_pct")
cols_total <- c("male_over_65_count","female_over_65_count","avg_over_65_count")

for(i in 1:length(cols_to_total)) {
  fh_acs_scoring_city <- fh_acs_scoring_city %>% mutate(!!cols_total[i] := .data[[cols_to_total[i]]]*total_population)
}
fh_acs_scoring_by_city <- fh_acs_scoring_city %>% group_by(place_state) %>% summarise_at(c(cols_total, 'total_population'), sum)

cols_to_prev <- c("male_over_65_pct","female_over_65_pct","avg_over_65_pct")
for(i in 1:length(cols_total)) {
  fh_acs_scoring_by_city <- fh_acs_scoring_by_city %>% mutate(!!cols_to_prev[i] := .data[[cols_total[i]]]/total_population)
}


## now take weighted prevalence average
weighted_prev_avg <- function(d, prefix_col) { 
  # d is grouped by frame for a city or state
  prev_col <- sprintf('%s_CrudePrev', prefix_col)
  se_col <- sprintf('%s_SE', prefix_col)
  ind <- complete.cases(d[, c(prev_col, se_col)])
  d <- d[ind,]
  weighted.mean(d[[prev_col]], 1/(d[[se_col]]+0.0005)) # note the fudge factor
}


weighted_prev_avgs <- function(d, prefix_cols) {
  avgs <- prefix_cols %>% map_dbl(~weighted_prev_avg(d, .x))
  place<- d[['place_state']][1]
  tibble(disease=prefix_cols, prevalence=avgs, place_state=place)
}

prefixes <- c("CANCER", "ARTHRITIS", "STROKE","CASTHMA", "COPD", "CHD", "DIABETES","KIDNEY", "BPMED", "CSMOKING", "BPHIGH","OBESITY","HIGHCHOL" )

fh_acs_scoring_prevs_by_city <- fh_acs_scoring_city %>% group_by(place_state) %>% group_split(keep=TRUE) %>% map_df(~weighted_prev_avgs(.x, prefixes))
fh_acs_scoring_prevs_by_city$disease <- sprintf('%s_CrudePrev', fh_acs_scoring_prevs_by_city$disease)
fh_acs_scoring_prevs_by_city <- fh_acs_scoring_prevs_by_city %>% spread(disease, prevalence)

fh_acs_scoring_by_city <- fh_acs_scoring_by_city %>% left_join(fh_acs_scoring_prevs_by_city, by='place_state')


pc_by_city <- prcomp(fh_acs_scoring_by_city[,established_cols],center=T, scale.=T)
summary(pc_by_city)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5    PC6
## Standard deviation     3.0342 1.7703 0.97630 0.71648 0.53455 0.5020
## Proportion of Variance 0.6138 0.2089 0.06354 0.03422 0.01905 0.0168
## Cumulative Proportion  0.6138 0.8227 0.88624 0.92046 0.93951 0.9563
##                            PC7     PC8    PC9    PC10    PC11    PC12
## Standard deviation     0.42483 0.37164 0.3464 0.25315 0.22144 0.19245
## Proportion of Variance 0.01203 0.00921 0.0080 0.00427 0.00327 0.00247
## Cumulative Proportion  0.96834 0.97755 0.9856 0.98982 0.99309 0.99556
##                           PC13    PC14    PC15
## Standard deviation     0.17249 0.14290 0.12805
## Proportion of Variance 0.00198 0.00136 0.00109
## Cumulative Proportion  0.99755 0.99891 1.00000
pc_by_city$rotation
##                            PC1         PC2         PC3         PC4
## CANCER_CrudePrev    -0.1513625 -0.46234141  0.22821155 -0.11472234
## ARTHRITIS_CrudePrev -0.2940666 -0.10042677  0.32428060 -0.15763331
## STROKE_CrudePrev    -0.3107515  0.09672029 -0.09092279  0.25688009
## CASTHMA_CrudePrev   -0.1934841  0.21545877  0.63368651  0.23624056
## COPD_CrudePrev      -0.3097551  0.05194417  0.15298565  0.07532731
## CHD_CrudePrev       -0.3175390 -0.03559995 -0.07979953  0.14232241
## DIABETES_CrudePrev  -0.2680392  0.19977554 -0.41492573  0.15166847
## KIDNEY_CrudePrev    -0.2902240  0.10601242 -0.23259744  0.45053133
## BPMED_CrudePrev     -0.2651378 -0.14699842 -0.08255760 -0.53277744
## CSMOKING_CrudePrev  -0.2626526  0.24489948  0.24498121 -0.12100572
## BPHIGH_CrudePrev    -0.3058207  0.07837667 -0.12820089 -0.16401360
## OBESITY_CrudePrev   -0.2378426  0.31547582  0.02046824 -0.30149445
## HIGHCHOL_CrudePrev  -0.2606948 -0.15744218 -0.30352090 -0.27935332
## male_over_65_pct    -0.1434706 -0.48444226 -0.01096777  0.21290136
## female_over_65_pct  -0.1607230 -0.46748113  0.03110201  0.22050634
##                             PC5         PC6         PC7         PC8
## CANCER_CrudePrev     0.08392488 -0.07347842  0.25553415 -0.46669281
## ARTHRITIS_CrudePrev  0.10658331 -0.10064919  0.22599308 -0.12733324
## STROKE_CrudePrev    -0.11393147  0.10683646 -0.13890118 -0.14884583
## CASTHMA_CrudePrev   -0.34373332 -0.47370433 -0.04433591  0.13814565
## COPD_CrudePrev       0.33440598  0.16857389 -0.26693587 -0.07595834
## CHD_CrudePrev        0.27360824  0.12780107  0.02874123 -0.28674742
## DIABETES_CrudePrev  -0.18302466 -0.09581671  0.09538506 -0.02628227
## KIDNEY_CrudePrev    -0.08977368 -0.09208268  0.08173772 -0.22918005
## BPMED_CrudePrev     -0.55368038  0.12104692 -0.36420795 -0.23807871
## CSMOKING_CrudePrev   0.31811780  0.34094732 -0.35361984  0.23671445
## BPHIGH_CrudePrev    -0.20110174 -0.01062993  0.01577419  0.32652656
## OBESITY_CrudePrev   -0.01310907  0.24021625  0.70054193  0.16737612
## HIGHCHOL_CrudePrev   0.38921737 -0.65747628 -0.10373285  0.24712102
## male_over_65_pct    -0.06690911  0.19134865  0.12225284  0.38076066
## female_over_65_pct  -0.13370052  0.16925387 -0.01007635  0.36074883
##                              PC9        PC10        PC11        PC12
## CANCER_CrudePrev     0.072870493 -0.33914832  0.27547172 -0.23406289
## ARTHRITIS_CrudePrev  0.241497931  0.67528027  0.12988256  0.28288316
## STROKE_CrudePrev     0.212762616 -0.23073121  0.25143581  0.17967364
## CASTHMA_CrudePrev   -0.145995946 -0.10206475 -0.12926353 -0.06412544
## COPD_CrudePrev       0.219679729 -0.10294665 -0.65236875 -0.05495016
## CHD_CrudePrev       -0.099853188  0.05922997 -0.18073711 -0.13243167
## DIABETES_CrudePrev   0.005254134  0.43398080 -0.01451853 -0.22891985
## KIDNEY_CrudePrev    -0.265574072 -0.13103901  0.16133076  0.17389213
## BPMED_CrudePrev     -0.261258660  0.02080074 -0.15209884  0.09552718
## CSMOKING_CrudePrev  -0.276916937  0.02510667  0.50793699 -0.02121700
## BPHIGH_CrudePrev     0.681418713 -0.23231460  0.14031844 -0.10110595
## OBESITY_CrudePrev   -0.259482972 -0.20680412 -0.16221918 -0.04383570
## HIGHCHOL_CrudePrev  -0.191505842 -0.11908380  0.00315688  0.03989486
## male_over_65_pct    -0.087832405 -0.06557378 -0.12377680  0.59321163
## female_over_65_pct  -0.141449238  0.17152905  0.01893415 -0.59193355
##                             PC13        PC14        PC15
## CANCER_CrudePrev     0.301953414  0.08927087  0.22890783
## ARTHRITIS_CrudePrev -0.237040114 -0.12640264  0.03568674
## STROKE_CrudePrev    -0.486543461  0.56781851  0.02883620
## CASTHMA_CrudePrev    0.127599997  0.10395957 -0.12462931
## COPD_CrudePrev      -0.003373466 -0.07894889  0.40162146
## CHD_CrudePrev        0.122546485  0.04612681 -0.78498727
## DIABETES_CrudePrev   0.470687683  0.32485392  0.26935319
## KIDNEY_CrudePrev    -0.092139236 -0.62623112  0.16457916
## BPMED_CrudePrev     -0.034577843 -0.06229959 -0.02928294
## CSMOKING_CrudePrev   0.236746526 -0.02143721  0.07471927
## BPHIGH_CrudePrev     0.157695741 -0.31052596 -0.20015988
## OBESITY_CrudePrev   -0.165828195  0.07669304  0.06497547
## HIGHCHOL_CrudePrev  -0.145504597  0.08690142  0.01766287
## male_over_65_pct     0.318050339  0.13286900 -0.01495440
## female_over_65_pct  -0.350178782 -0.07011795  0.03949111
fh_acs_scoring_by_city$pc_1 <- as.numeric(pc_by_city$x[, 1])
fh_acs_scoring_by_city$pc_2 <- as.numeric(pc_by_city$x[, 2])

fh_acs_scoring_by_city <- fh_acs_scoring_by_city %>% mutate(risk_score_pc_1=as.numeric(rescale(-pc_1, to=c(0,100)))) # flip the sign
fh_acs_scoring_by_city <- fh_acs_scoring_by_city %>% mutate(risk_score_pc_2=as.numeric(rescale(-pc_2, to=c(0,100))))
fh_acs_scoring_by_city <- fh_acs_scoring_by_city %>% mutate(risk_score_both_pc=rescale(risk_score_pc_1*.61 + risk_score_pc_2*.24, to=c(0,100))) # rescale according the the contribution of the PC
fh_acs_scoring_by_city <- fh_acs_scoring_by_city %>% mutate(risk_score_all=rowSums(.[established_cols])) %>% mutate(risk_score_all=as.numeric(rescale(risk_score_all, to=c(0,100))))

plot(fh_acs_scoring_by_city$risk_score_pc_1, fh_acs_scoring_by_city$risk_score_all)

plot(fh_acs_scoring_by_city$risk_score_pc_2, fh_acs_scoring_by_city$risk_score_all)

# add stateabbr
plot(fh_acs_scoring_by_city$risk_score_both_pc, fh_acs_scoring_by_city$avg_over_65_pct)

plot(fh_acs_scoring_by_city$risk_score_both_pc, fh_acs_scoring_by_city$DIABETES_CrudePrev)

fh_acs_scoring_by_city <- fh_acs_scoring_by_city %>% separate(place_state, c("placename", "stateabbr"), sep="\\|")
  

top_per_state <- fh_acs_scoring_by_city %>% group_by(stateabbr) %>% top_n(5,risk_score_pc_1) %>% ungroup() 
DT::datatable(top_per_state %>% dplyr::select(stateabbr, placename, risk_score_pc_1, risk_score_both_pc, risk_score_all))
fh_acs_scoring_by_city_distribute <- fh_acs_scoring_by_city %>% dplyr::select(c(established_cols, "placename", "stateabbr", "risk_score_both_pc", "risk_score_pc_1", "risk_score_pc_2", "risk_score_all", "total_population")) 
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(established_cols)` instead of `established_cols` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
fh_acs_scoring_by_city_distribute <- fh_acs_scoring_by_city_distribute %>% mutate(risk_score = risk_score_both_pc)
fh_acs_scoring_by_city_distribute <- fh_acs_scoring_by_city_distribute %>% group_split(stateabbr) %>% map(function(.x) { .x %>% mutate(rank_in_state=rank(-risk_score)) }) %>% bind_rows()
if(WRITE_OUTPUT) {
  write_rds(fh_acs_scoring_by_city, path = "fh_acs_covid_city_comm_score.rds")
  write_csv(fh_acs_scoring_by_city_distribute, path="fh_acs_covid_city_comm_score.csv")
}

County COVID-19 Community Risk Score

fh_acs_scoring_county <- fh_acs_scoring %>% dplyr::select(-c(pc_1, pc_2, risk_score_pc_1, risk_score_pc_2, risk_score_all, risk_score_both_pc, lat,long, geoid, fips_place_tract)) %>% mutate(total_population = total_males + total_females)
location_info <- fh_acs_scoring %>% dplyr::select(county_code, stateabbr) %>% unique() # go up a hierarchy
fh_acs_scoring_county <- fh_acs_scoring_county %>% cbind(fh_acs[non_na, c(established_disease_se, established_risk_se)])

county_counts <- fh_acs_scoring_county %>% group_by(county_code) %>% summarize(n=n())

fh_acs_scoring_county <- fh_acs_scoring_county %>% left_join(county_counts) %>% filter(n > 1)
## Joining, by = "county_code"
for(i in 1:length(cols_to_total)) {
  fh_acs_scoring_county <- fh_acs_scoring_county %>% mutate(!!cols_total[i] := .data[[cols_to_total[i]]]*total_population)
}
fh_acs_scoring_by_county <- fh_acs_scoring_county %>% group_by(county_code) %>% summarise_at(c(cols_total, 'total_population'), sum)

for(i in 1:length(cols_total)) {
  fh_acs_scoring_by_county <- fh_acs_scoring_by_county %>% mutate(!!cols_to_prev[i] := .data[[cols_total[i]]]/total_population)
}

weighted_county_code_prev_avgs <- function(d, prefix_cols) {
  avgs <- prefix_cols %>% map_dbl(~weighted_prev_avg(d, .x))
  place<- d[['county_code']][1]
  tibble(disease=prefix_cols, prevalence=avgs, county_code=place)
}

fh_acs_scoring_prevs_by_county <- fh_acs_scoring_county %>% group_by(county_code) %>% group_split(keep=TRUE) %>% map_df(~weighted_county_code_prev_avgs(.x, prefixes))
fh_acs_scoring_prevs_by_county$disease <- sprintf('%s_CrudePrev', fh_acs_scoring_prevs_by_county$disease)
fh_acs_scoring_prevs_by_county <- fh_acs_scoring_prevs_by_county %>% spread(disease, prevalence)

fh_acs_scoring_by_county <- fh_acs_scoring_by_county %>% left_join(fh_acs_scoring_prevs_by_county, by='county_code')

heatmap.2(cor(fh_acs_scoring_by_county[,established_cols]))

pc_by_county <- prcomp(fh_acs_scoring_by_county[,established_cols],center=T, scale.=T)
summary(pc_by_county)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6
## Standard deviation     3.0683 1.6631 0.93637 0.74657 0.56938 0.55327
## Proportion of Variance 0.6276 0.1844 0.05845 0.03716 0.02161 0.02041
## Cumulative Proportion  0.6276 0.8120 0.87048 0.90763 0.92925 0.94965
##                            PC7     PC8     PC9    PC10   PC11    PC12
## Standard deviation     0.47853 0.40564 0.33632 0.27247 0.2479 0.21505
## Proportion of Variance 0.01527 0.01097 0.00754 0.00495 0.0041 0.00308
## Cumulative Proportion  0.96492 0.97589 0.98343 0.98838 0.9925 0.99556
##                           PC13    PC14    PC15
## Standard deviation     0.17598 0.14248 0.12373
## Proportion of Variance 0.00206 0.00135 0.00102
## Cumulative Proportion  0.99763 0.99898 1.00000
pc_by_county$rotation
##                            PC1         PC2         PC3         PC4
## CANCER_CrudePrev    -0.1666402 -0.46710898 -0.12727428  0.23758525
## ARTHRITIS_CrudePrev -0.2902344 -0.10349410 -0.27079042  0.30362465
## STROKE_CrudePrev    -0.3065678  0.10300399  0.03877146 -0.30512916
## CASTHMA_CrudePrev   -0.1906044  0.19933650 -0.71060290 -0.12545798
## COPD_CrudePrev      -0.3050213  0.05025504 -0.16770384 -0.02522198
## CHD_CrudePrev       -0.3123990 -0.03988041  0.06446970 -0.13628887
## DIABETES_CrudePrev  -0.2784138  0.19377115  0.32685305 -0.25976200
## KIDNEY_CrudePrev    -0.2920983  0.09514468  0.14000965 -0.46110621
## BPMED_CrudePrev     -0.2712243 -0.08809253  0.19144072  0.34971923
## CSMOKING_CrudePrev  -0.2553653  0.24179824 -0.25999076  0.15589505
## BPHIGH_CrudePrev    -0.2972654  0.10760894  0.17220335  0.08043558
## OBESITY_CrudePrev   -0.2349413  0.30978832  0.09604647  0.32432325
## HIGHCHOL_CrudePrev  -0.2688750 -0.13348753  0.30635232  0.30006038
## male_over_65_pct    -0.1427766 -0.50051949 -0.03218733 -0.21389231
## female_over_65_pct  -0.1708718 -0.47741351 -0.08535878 -0.22173714
##                              PC5         PC6         PC7         PC8
## CANCER_CrudePrev    -0.217197418  0.26148317 -0.10081538  0.42762361
## ARTHRITIS_CrudePrev -0.113200438  0.09646505  0.16711323  0.06266107
## STROKE_CrudePrev    -0.009377157 -0.01500191 -0.15709016  0.09305968
## CASTHMA_CrudePrev   -0.434083544 -0.27028419  0.13168005 -0.07095784
## COPD_CrudePrev       0.209145940  0.28647467 -0.13146871 -0.22161814
## CHD_CrudePrev        0.051017308  0.34623163 -0.11330264  0.20334670
## DIABETES_CrudePrev  -0.137591318 -0.08406118  0.06437783  0.03313377
## KIDNEY_CrudePrev    -0.172878229  0.09849564 -0.04250791  0.21406198
## BPMED_CrudePrev     -0.097418752 -0.56310734 -0.58525826  0.09706909
## CSMOKING_CrudePrev   0.527681173  0.16012193 -0.30666530 -0.21809934
## BPHIGH_CrudePrev    -0.027674320 -0.27840922  0.20824528 -0.35326231
## OBESITY_CrudePrev    0.309981549 -0.13935792  0.52767195  0.49132355
## HIGHCHOL_CrudePrev  -0.344718169  0.30091911  0.21312741 -0.46981273
## male_over_65_pct     0.324901371 -0.23090688  0.26346865 -0.12385887
## female_over_65_pct   0.217135134 -0.21633114  0.10298491 -0.05370117
##                              PC9        PC10         PC11         PC12
## CANCER_CrudePrev    -0.160096403  0.42065782 -0.090210874  0.065688156
## ARTHRITIS_CrudePrev -0.262912439 -0.61250516 -0.385182227 -0.050418232
## STROKE_CrudePrev    -0.176366306  0.26316364 -0.147829009 -0.207427780
## CASTHMA_CrudePrev    0.212887732  0.09933891  0.146680817 -0.020580787
## COPD_CrudePrev      -0.283817815 -0.10392786  0.681535675  0.049684335
## CHD_CrudePrev       -0.001090552 -0.16300058  0.143841559  0.012083721
## DIABETES_CrudePrev   0.029372901 -0.35102120 -0.125031911  0.248568010
## KIDNEY_CrudePrev     0.185609722  0.08170092 -0.116066313 -0.140859362
## BPMED_CrudePrev      0.127861127 -0.12190631  0.179796674 -0.096108903
## CSMOKING_CrudePrev   0.290501158  0.16103908 -0.430378527  0.021049794
## BPHIGH_CrudePrev    -0.580849785  0.35365074 -0.149304675  0.110814705
## OBESITY_CrudePrev    0.176941340  0.12613164  0.206371197  0.006478071
## HIGHCHOL_CrudePrev   0.446212140  0.11483896  0.053395650 -0.074216004
## male_over_65_pct     0.061794076 -0.07979618  0.023375429 -0.607072615
## female_over_65_pct   0.201480946  0.01310754 -0.003369048  0.685130788
##                            PC13        PC14         PC15
## CANCER_CrudePrev    -0.29982048  0.20020110  0.166767159
## ARTHRITIS_CrudePrev  0.29137577 -0.01775561  0.071809041
## STROKE_CrudePrev     0.45430470  0.45389030 -0.435315813
## CASTHMA_CrudePrev   -0.17126174 -0.02650641 -0.110546878
## COPD_CrudePrev       0.06989649  0.19403670  0.287434518
## CHD_CrudePrev       -0.24054627 -0.52407160 -0.567008450
## DIABETES_CrudePrev  -0.52494052  0.44582495  0.059742263
## KIDNEY_CrudePrev     0.22905717 -0.34963881  0.580545810
## BPMED_CrudePrev      0.04540168 -0.07655242  0.022340101
## CSMOKING_CrudePrev  -0.18419399  0.01636916  0.092399450
## BPHIGH_CrudePrev    -0.12971666 -0.32045102  0.030812750
## OBESITY_CrudePrev    0.08756811  0.03935685 -0.003459127
## HIGHCHOL_CrudePrev   0.12946752  0.07607271 -0.085251772
## male_over_65_pct    -0.23730476  0.05099024  0.022145883
## female_over_65_pct   0.25524112 -0.02870006 -0.053249529
fh_acs_scoring_by_county$pc_1 <- as.numeric(pc_by_county$x[, 1])
fh_acs_scoring_by_county$pc_2 <- as.numeric(pc_by_county$x[, 2])


fh_acs_scoring_by_county <- fh_acs_scoring_by_county %>% mutate(risk_score_pc_1=as.numeric(rescale(-pc_1,to=c(0,100)))) 
fh_acs_scoring_by_county <- fh_acs_scoring_by_county %>% mutate(risk_score_pc_2=as.numeric(rescale(-pc_2,to=c(0,100))))
fh_acs_scoring_by_county <- fh_acs_scoring_by_county %>% mutate(risk_score_both_pc=rescale(risk_score_pc_1*.61 + risk_score_pc_2*.24, to=c(0,100))) # rescale according the the contribution of the PC
fh_acs_scoring_by_county <- fh_acs_scoring_by_county %>% mutate(risk_score_all=rowSums(.[established_cols])) %>% mutate(risk_score_all=as.numeric(rescale(risk_score_all, to=c(0,100))))

plot(fh_acs_scoring_by_county$risk_score_pc_1, fh_acs_scoring_by_county$risk_score_all)

plot(fh_acs_scoring_by_county$risk_score_pc_2, fh_acs_scoring_by_county$risk_score_all)

fh_acs_scoring_by_county_distribute <- fh_acs_scoring_by_county %>% dplyr::select(c(established_cols, "county_code", "risk_score_both_pc", "risk_score_pc_1", "risk_score_pc_2", "risk_score_all", "total_population")) 

fh_acs_scoring_by_county_distribute <- fh_acs_scoring_by_county_distribute %>% mutate(risk_score = risk_score_both_pc)
fh_acs_scoring_by_county_distribute <- fh_acs_scoring_by_county_distribute %>% left_join(location_info)
## Joining, by = "county_code"
fh_acs_scoring_by_county_distribute <- fh_acs_scoring_by_county_distribute %>% group_split(stateabbr) %>% map(function(.x) { .x %>% mutate(rank_in_state=rank(-risk_score)) }) %>% bind_rows()

DT::datatable(fh_acs_scoring_by_county_distribute %>% dplyr::select(county_code, stateabbr, risk_score, risk_score_pc_1, risk_score_all))
if(WRITE_OUTPUT) {
  write_rds(fh_acs_scoring_by_county, path = "fh_acs_covid_county_comm_score.rds")
  write_csv(fh_acs_scoring_by_county_distribute, path="fh_acs_covid_county_comm_score.csv")
}

State COVID-19 Community Risk Score

fh_acs_scoring_state <- fh_acs_scoring %>% dplyr::select(-c(pc_1, pc_2, risk_score_pc_1, risk_score_pc_2, risk_score_all, risk_score_both_pc, lat,long, geoid, fips_place_tract)) %>% mutate(total_population = total_males + total_females)

fh_acs_scoring_state <- fh_acs_scoring_state %>% cbind(fh_acs[non_na, c(established_disease_se, established_risk_se)])

cols_to_total <- c("male_over_65_pct","female_over_65_pct","avg_over_65_pct")
cols_total <- c("male_over_65_count","female_over_65_count","avg_over_65_count")

for(i in 1:length(cols_to_total)) {
  fh_acs_scoring_state <- fh_acs_scoring_state %>% mutate(!!cols_total[i] := .data[[cols_to_total[i]]]*total_population)
}
fh_acs_scoring_by_state <- fh_acs_scoring_state %>% group_by(stateabbr) %>% summarise_at(c(cols_total, 'total_population'), sum)

cols_to_prev <- c("male_over_65_pct","female_over_65_pct","avg_over_65_pct")
for(i in 1:length(cols_total)) {
  fh_acs_scoring_by_state <- fh_acs_scoring_by_state %>% mutate(!!cols_to_prev[i] := .data[[cols_total[i]]]/total_population)
}

weighted_state_prev_avgs <- function(d, prefix_cols) {
  avgs <- prefix_cols %>% map_dbl(~weighted_prev_avg(d, .x))
  place<- d[['stateabbr']][1]
  tibble(disease=prefix_cols, prevalence=avgs, stateabbr=place)
}

fh_acs_scoring_prevs_by_state <- fh_acs_scoring_state %>% group_by(stateabbr) %>% group_split(keep=TRUE) %>% map_df(~weighted_state_prev_avgs(.x, prefixes))
fh_acs_scoring_prevs_by_state$disease <- sprintf('%s_CrudePrev', fh_acs_scoring_prevs_by_state$disease)
fh_acs_scoring_prevs_by_state <- fh_acs_scoring_prevs_by_state %>% spread(disease, prevalence)

fh_acs_scoring_by_state <- fh_acs_scoring_by_state %>% left_join(fh_acs_scoring_prevs_by_state, by='stateabbr')

heatmap.2(cor(fh_acs_scoring_by_state[,established_cols]))

pc_by_state <- prcomp(fh_acs_scoring_by_state[,established_cols],center=T, scale.=T)
summary(pc_by_state)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6
## Standard deviation     3.0624 1.6261 0.93665 0.84338 0.60304 0.55470
## Proportion of Variance 0.6252 0.1763 0.05849 0.04742 0.02424 0.02051
## Cumulative Proportion  0.6252 0.8015 0.85998 0.90740 0.93164 0.95216
##                            PC7     PC8     PC9    PC10    PC11    PC12
## Standard deviation     0.42119 0.40141 0.35899 0.26226 0.24022 0.20958
## Proportion of Variance 0.01183 0.01074 0.00859 0.00459 0.00385 0.00293
## Cumulative Proportion  0.96398 0.97473 0.98332 0.98790 0.99175 0.99468
##                           PC13    PC14   PC15
## Standard deviation     0.18279 0.15936 0.1450
## Proportion of Variance 0.00223 0.00169 0.0014
## Cumulative Proportion  0.99691 0.99860 1.0000
pc_by_state$rotation
##                            PC1         PC2         PC3          PC4
## CANCER_CrudePrev    0.10849184 -0.48687692  0.31398024 -0.411886788
## ARTHRITIS_CrudePrev 0.28762056 -0.17192838  0.25955907 -0.197225365
## STROKE_CrudePrev    0.29568787  0.16693564 -0.03297050  0.230510634
## CASTHMA_CrudePrev   0.20051809  0.05812432  0.69740669  0.425880543
## COPD_CrudePrev      0.30897788 -0.02911972  0.16850702 -0.046970195
## CHD_CrudePrev       0.30775724 -0.05746680  0.06694467 -0.135046388
## DIABETES_CrudePrev  0.29745592  0.16297635 -0.20301709  0.171436753
## KIDNEY_CrudePrev    0.28920461  0.13182034 -0.09141990  0.335045426
## BPMED_CrudePrev     0.27306413 -0.03869105 -0.27696012  0.006104727
## CSMOKING_CrudePrev  0.28252688  0.14581369  0.07492955 -0.172516093
## BPHIGH_CrudePrev    0.30551845  0.09775451 -0.17117986  0.014660662
## OBESITY_CrudePrev   0.26438155  0.23866718 -0.01158984 -0.344739791
## HIGHCHOL_CrudePrev  0.26508121 -0.07842503 -0.28536429 -0.290631223
## male_over_65_pct    0.09406325 -0.54440342 -0.23571636  0.238328022
## female_over_65_pct  0.13719840 -0.51210848 -0.12401379  0.323614019
##                              PC5         PC6          PC7         PC8
## CANCER_CrudePrev    -0.092800184  0.17138413 -0.036601197  0.33053106
## ARTHRITIS_CrudePrev  0.122650282 -0.12151074 -0.071701223  0.12127297
## STROKE_CrudePrev    -0.115817383  0.29426838 -0.119429075  0.13270871
## CASTHMA_CrudePrev    0.120709997 -0.37897765  0.038021789 -0.06411735
## COPD_CrudePrev       0.074447044  0.15528084  0.254328731 -0.16032606
## CHD_CrudePrev       -0.002075941  0.39780839  0.198677079  0.09173957
## DIABETES_CrudePrev   0.168051107  0.06790125  0.008225568  0.16780350
## KIDNEY_CrudePrev     0.093091476  0.33141343 -0.013481235  0.26613298
## BPMED_CrudePrev     -0.544064878 -0.47452712  0.436704927  0.32519894
## CSMOKING_CrudePrev  -0.434077909  0.15799066  0.072228542 -0.70237792
## BPHIGH_CrudePrev     0.129793475 -0.23110414 -0.277026208 -0.09955023
## OBESITY_CrudePrev   -0.125653438 -0.21319431 -0.650209620  0.11167196
## HIGHCHOL_CrudePrev   0.618208470 -0.21857354  0.277584039 -0.19754738
## male_over_65_pct    -0.041764415  0.09842596 -0.299529602 -0.18383121
## female_over_65_pct  -0.070108924 -0.14958462 -0.105783421 -0.16408911
##                             PC9        PC10         PC11        PC12
## CANCER_CrudePrev    -0.33847058  0.27385133 -0.022327365  0.33731906
## ARTHRITIS_CrudePrev  0.22244333 -0.60346181  0.406571091 -0.22591943
## STROKE_CrudePrev    -0.16795671  0.08438731  0.529350080  0.07680340
## CASTHMA_CrudePrev   -0.19024150  0.07708237 -0.090885615 -0.03016615
## COPD_CrudePrev       0.43576304  0.43458153 -0.231497062 -0.26849595
## CHD_CrudePrev        0.32665747 -0.06317301 -0.062038440 -0.04336513
## DIABETES_CrudePrev   0.16708007 -0.16881185 -0.162410783  0.39273395
## KIDNEY_CrudePrev    -0.38074297 -0.11567382 -0.305453027 -0.14729176
## BPMED_CrudePrev     -0.05992350  0.04338464  0.006375307 -0.15078108
## CSMOKING_CrudePrev  -0.25668548 -0.17977944 -0.008194147  0.14487659
## BPHIGH_CrudePrev     0.14631682  0.48295176  0.390584986  0.18808809
## OBESITY_CrudePrev    0.02950784 -0.03225527 -0.410628122 -0.12963899
## HIGHCHOL_CrudePrev  -0.36860676 -0.03898386 -0.036720263 -0.01577031
## male_over_65_pct    -0.13471336  0.08823552  0.032653899 -0.51681016
## female_over_65_pct   0.23728386 -0.18776488 -0.211935685  0.46391065
##                            PC13        PC14         PC15
## CANCER_CrudePrev     0.02199290  0.11447485  0.130167303
## ARTHRITIS_CrudePrev -0.15251017  0.26895818  0.078774664
## STROKE_CrudePrev    -0.28876303 -0.52508602  0.134373204
## CASTHMA_CrudePrev    0.22722459 -0.14497817 -0.030962493
## COPD_CrudePrev      -0.39636904  0.05755014  0.296967872
## CHD_CrudePrev        0.46525605 -0.27770811 -0.512941666
## DIABETES_CrudePrev   0.39440097  0.09678108  0.592727304
## KIDNEY_CrudePrev    -0.23994099  0.42936186 -0.271286432
## BPMED_CrudePrev      0.03432547 -0.01059645 -0.002745499
## CSMOKING_CrudePrev   0.06213576  0.16059531  0.051847316
## BPHIGH_CrudePrev     0.12684531  0.40721317 -0.293562171
## OBESITY_CrudePrev   -0.07203648 -0.25448972 -0.019739381
## HIGHCHOL_CrudePrev  -0.07019707 -0.25693202 -0.041457898
## male_over_65_pct     0.32997346 -0.02800812  0.209073162
## female_over_65_pct  -0.33825768 -0.14276478 -0.216611910
fh_acs_scoring_by_state$pc_1 <- as.numeric(pc_by_state$x[, 1])
fh_acs_scoring_by_state$pc_2 <- as.numeric(pc_by_state$x[, 2])

fh_acs_scoring_by_state <- fh_acs_scoring_by_state %>% mutate(risk_score_pc_1=as.numeric(rescale(pc_1,to=c(0,100))))
fh_acs_scoring_by_state <- fh_acs_scoring_by_state %>% mutate(risk_score_pc_2=as.numeric(rescale(-pc_2,to=c(0,100))))
fh_acs_scoring_by_state <- fh_acs_scoring_by_state %>% mutate(risk_score_both_pc=rescale(risk_score_pc_1*.61 + risk_score_pc_2*.24, to=c(0,100))) # rescale according the the contribution of the PC
fh_acs_scoring_by_state <- fh_acs_scoring_by_state %>% mutate(risk_score_all=rowSums(.[established_cols])) %>% mutate(risk_score_all=as.numeric(rescale(risk_score_all,to=c(0,100))))


fh_acs_scoring_by_state_distribute <- fh_acs_scoring_by_state %>% dplyr::select(c(established_cols, "stateabbr", "risk_score_both_pc", "risk_score_pc_1", "risk_score_pc_2", "risk_score_all", "total_population")) 

fh_acs_scoring_by_state_distribute <- fh_acs_scoring_by_state_distribute %>% mutate(rank_in_country=rank(risk_score_both_pc)) %>% mutate(risk_score = risk_score_both_pc)

DT::datatable(fh_acs_scoring_by_state_distribute %>% dplyr::select(stateabbr, risk_score,risk_score_all))
if(WRITE_OUTPUT) {
  write_rds(fh_acs_scoring_by_state, path = "fh_acs_covid_state_comm_score.rds")
  write_csv(fh_acs_scoring_by_state_distribute, path="fh_acs_covid_state_comm_score.csv")
}